Tutorial Brief

Preprocessing is the set of transformations applied to your data before training your model. This includes converting your data to its basic numerical components.

Video Tutorial:

http://youtu.be/NWp6DFtnqYk

Data Set

My Youtube stats as of Sep 29, 2014.

Columns:

  • Video
  • Upload Time
  • Video length (minutes)
  • Views
  • Estimated minutes watched
  • Average view duration (minutes)
  • Average percentage viewed
  • Subscriber views
  • Subscriber minutes watched
  • Likes
  • Dislikes
  • Shares
  • Comments
  • Favorites
  • Subscribers

Import Libraries


In [1]:
# Core Libraries
import os
from fnmatch import fnmatch
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from IPython.html import widgets
from IPython.html.widgets import interact
from IPython.display import display
import urllib2
from datetime import datetime

# Feature Extraction
from sklearn.feature_extraction import DictVectorizer

# Preprocessing
from sklearn import preprocessing

# External
from sklearn.externals import joblib

# Hide Warnings
# NOTE: the 'ignore' action is installed without a category or module filter,
# so every warning raised after this point is suppressed for the whole session.
import warnings
warnings.filterwarnings('ignore')

# Configure Pandas
# Allow up to 100 columns and a 120-character display width when frames are printed.
pd.options.display.max_columns = 100
pd.options.display.width = 120

Loading Data


In [4]:
# Alert templates owned by the "Loading Data" cell.
# They get cell-specific names because later cells rebind `success_alert` and
# `error_alert` to templates with a different number of %s placeholders; a
# click handler that looked those names up at click time would then fail with
# a string-formatting TypeError.
LOAD_SUCCESS_ALERT = """
<div class="alert alert-success" role="alert">Loading data from %s was successful.</div>
"""

LOAD_ERROR_ALERT = """
<div class="alert alert-danger" role="alert">Error loading data from %s. %s</div>
"""

# Original names kept so anything that still refers to them keeps working.
success_alert = LOAD_SUCCESS_ALERT
error_alert = LOAD_ERROR_ALERT


def load_data(widget):
    """Click handler: read the CSV at the chosen path/URL into the global ``csv_data``.

    Reads its settings from the module-level widgets ``wgt_file_location``
    (path or URL), ``wgt_separator`` (field separator), ``wgt_header``
    (whether the file supplies its own header row) and ``wgt_manual_header``
    (comma-separated column names used when it does not).

    The outcome is reported through ``wgt_alert``; any exception is caught,
    printed together with the path, and shown in the alert instead of being
    re-raised.

    Parameters
    ----------
    widget : the widget that fired the click (unused).
    """
    global csv_data
    path = wgt_file_location.value
    try:
        if wgt_header.value:
            csv_data = pd.read_csv(path, sep=wgt_separator.value)
        else:
            csv_data = pd.read_csv(path, sep=wgt_separator.value,
                                   names=wgt_manual_header.value.split(","))
        wgt_alert.value = LOAD_SUCCESS_ALERT % path
    except Exception as ex:
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(ex)
        print(path)
        wgt_alert.value = LOAD_ERROR_ALERT % (path, ex)
    wgt_alert.visible = True
    

def preview_file(widget):
    """Click handler: show the first 1000 characters of the selected file or URL.

    The location is read from ``wgt_file_location``. Paths starting with
    ``http://``, ``https://`` or ``ftp://`` are opened with ``urllib2``;
    anything else is opened as a local file. The text is written into
    ``wgt_file_preview`` inside a ``<pre>`` element.

    Parameters
    ----------
    widget : the widget that fired the click (unused).
    """
    path = wgt_file_location.value
    is_remote = path.startswith(("http://", "https://", "ftp://"))
    raw_file = urllib2.urlopen(path) if is_remote else open(path)
    try:
        head = raw_file.read(1000)
    finally:
        # Release the handle even when the read itself fails.
        raw_file.close()
    # The preview is rendered as HTML, so markup characters in the data must be
    # escaped or they would be interpreted instead of displayed.
    head = head.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;")
    wgt_file_preview.value = "<pre>%s</pre>" % head
    

def manual_columns(name,old,new):
    """Trait-change handler for the header checkbox.

    Sets the manual-header text box's visibility to the checkbox's *previous*
    value, so the box appears when the checkbox goes from checked to unchecked.
    """
    was_checked = old
    wgt_manual_header.visible = was_checked
    
def update_path(name,old,new):
    """Trait-change handler: copy the newly selected dropdown value into the path box."""
    selected_path = new
    wgt_file_location.value = selected_path
    
    
def load_files(widget):
    """Fill ``widget.values`` with every ``*.txt`` / ``*.csv`` file under the current directory.

    The directory tree rooted at ``os.curdir`` is walked recursively. The
    result is a dict mapping each matching file's joined path to itself.

    Parameters
    ----------
    widget : object whose ``values`` attribute receives the mapping.
    """
    wanted_patterns = ("*.txt", "*.csv")
    matches = {}
    for folder, _subdirs, filenames in os.walk(os.curdir):
        for filename in filenames:
            if any(fnmatch(filename, pattern) for pattern in wanted_patterns):
                full_path = os.path.join(folder, filename)
                matches[full_path] = full_path
    widget.values = matches

# Container that holds every control of the data-loading form.
container = widgets.ContainerWidget()

# Individual controls; the callbacks defined above look these up by name.
wgt_alert = widgets.HTMLWidget()
wgt_file_location = widgets.TextWidget(description="Path/URL:")
wgt_file_path = widgets.DropdownWidget(description="Files List")
wgt_separator = widgets.TextWidget(description="Separator", value=",")
wgt_header = widgets.CheckboxWidget(description="First columns is a header?", value=True)
wgt_manual_header = widgets.TextWidget(description="Columns seperated by commas", visible=False)
wgt_load_data = widgets.ButtonWidget(description="Load Data")
wgt_preview_file = widgets.ButtonWidget(description="Preview File")
wgt_file_preview = widgets.HTMLWidget()

# The alert stays hidden until load_data has a result to report.
wgt_alert.visible = False

# Event wiring: buttons run the loaders, the dropdown is filled when it is
# displayed and feeds the path box when its value changes, and the header
# checkbox drives the manual-header box.
wgt_load_data.on_click(load_data)
wgt_preview_file.on_click(preview_file)
wgt_file_path.on_displayed(load_files)
wgt_file_path.on_trait_change(update_path, "value")
wgt_header.on_trait_change(manual_columns, "value")

# The order of this tuple is the order in which the controls are laid out.
container.children = (wgt_alert, wgt_file_path, wgt_file_location, wgt_separator, wgt_header, wgt_manual_header,
                      wgt_load_data, wgt_preview_file, wgt_file_preview)

display(container)

Type Conversion


In [5]:
# Alert templates for the "Type Conversion" cell.
# NOTE: these rebind the same global names used by the "Loading Data" cell, and
# the placeholder counts differ (no %s in the success text, one in the error
# text). Any callback that looks the names up at click time sees whichever
# cell ran last.
success_alert = """
<div class="alert alert-success" role="alert">Type conversion was successful.</div>
"""

error_alert = """
<div class="alert alert-danger" role="alert">Error in type conversion. %s.</div>
"""

def get_stats(column):
    """Return an HTML snippet with max, min, mean and median of a ``csv_data`` column.

    Parameters
    ----------
    column : label of the column in the global ``csv_data`` frame.

    Returns
    -------
    str
        The formatted statistics when the column's dtype is int32, int64,
        float32 or float64; the text ``"Not numerical"`` otherwise.
    """
    values = pd.Series(csv_data[column].values.ravel())
    if str(values.dtype) not in ("int32", "int64", "float32", "float64"):
        return "Not numerical"
    template = ("Max:<span class='badge'>%s</span>, Min:<span class='badge'>%s</span>,"
                "Avg:<span class='badge'>%s</span>, Median:<span class='badge'>%s</span>")
    return template % (values.max(), values.min(), values.mean(), values.median())


def get_column_type(column):
    """Return the type-dropdown label that matches a ``csv_data`` column's dtype.

    Parameters
    ----------
    column : label of the column in the global ``csv_data`` frame.

    Returns
    -------
    str
        ``"Int"`` for int32/int64, ``"Float"`` for float32/float64,
        ``"Boolean"`` for bool, and ``"Object"`` for object or any other dtype.

    Notes
    -----
    The result is used as the initial value of a dropdown whose options are
    fixed label strings, so a label is always returned. ``"Object"`` is the
    fallback because ``process_column`` performs no conversion for it.
    """
    dtype_name = str(pd.Series(csv_data[column].values.ravel()).dtype)
    labels_by_dtype = {
        "int32": "Int",
        "int64": "Int",
        "float32": "Float",
        "float64": "Float",
        "bool": "Boolean",
        "object": "Object",
    }
    # Dtypes without a dedicated label (e.g. datetime64[ns]) fall back to
    # "Object" instead of yielding None.
    return labels_by_dtype.get(dtype_name, "Object")

def process_column(column):
    """Apply the conversion chosen in one row of the type-conversion form.

    ``column`` is a row container: child 1 holds the column name and child 2
    the selected type label. "Float" and "Int" cast the column in place; the
    two date options add a new ``<name>_date`` column. Any other label leaves
    ``csv_data`` untouched.
    """
    name = column.children[1].value
    chosen_type = column.children[2].value

    if chosen_type in ("Float", "Int"):
        numpy_type = np.float64 if chosen_type == "Float" else np.int64
        csv_data[name] = csv_data[name].astype(numpy_type)
    elif chosen_type == "Ordinal Date":
        csv_data[name + "_date"] = csv_data[name].apply(datetime.fromordinal)
    elif chosen_type == "Text Date":
        # The strptime format comes from the shared "Text Date Format" box.
        date_format = wgt_date_format.value
        as_text = csv_data[name].astype(str)
        csv_data[name + "_date"] = as_text.apply(datetime.strptime, args=(date_format,))


def process_columns(widget):
    """Click handler: run ``process_column`` on every row container, then show the result alert.

    Rows are the children of ``main_container`` that are ``ContainerWidget``
    instances. Any exception is caught and reported through ``wgt_alert``.
    """
    try:
        rows = [child for child in main_container.children
                if isinstance(child, widgets.ContainerWidget)]
        for row in rows:
            process_column(row)
        wgt_alert.value = success_alert
    except Exception as ex:
        wgt_alert.value = error_alert % ex
    # The alert is revealed on both the success and the failure path.
    wgt_alert.visible = True


# Build the type-conversion form: one row per column of csv_data.
# NOTE(review): the container is displayed before it is populated and before
# the CSS class calls below; presumably this widget API needs the widget to be
# displayed first - confirm before reordering.
main_container = widgets.ContainerWidget()
display(main_container)

# Not read by any code visible in this notebook excerpt.
columns = []

wgt_alert = widgets.HTMLWidget(visible=False)
wgt_date_format = widgets.TextWidget(description="Text Date Format:" ,value="%Y%m%d")
wgt_process = widgets.ButtonWidget(description="Process Columns")
wgt_process.on_click(process_columns)

main_container.children = (wgt_alert, wgt_date_format,)

for column in csv_data.columns:
    temp_container = widgets.ContainerWidget()
    
    main_container.children += (temp_container,)
    
    # Swap the 'vbox' class for 'hbox' so the row's children sit side by side.
    temp_container.remove_class('vbox')
    temp_container.add_class('hbox')
    temp_container.add_class('start')
    
    # Row layout: [0] checkbox, [1] column name, [2] target type, [3] statistics.
    # process_column reads children[1] and children[2] only; the checkbox is
    # not consulted there.
    w1 = widgets.CheckboxWidget(value=True)
    w2 = widgets.TextWidget(value=column, disabled=True)
    w3 = widgets.DropdownWidget(values=["Float", "Int", "Ordinal Date", "Text Date", "Boolean", "Object"],
                                value = get_column_type(column))
    w4 = widgets.HTMLWidget()
    
    w2.set_css("width","200px")
    
    stats = get_stats(column)
    w4.value = "<pre>%s</pre>" % stats
    
    children = [w1, w2, w3, w4]

    temp_container.children = children

# The action button goes last so it is laid out below all the rows.
main_container.children += (wgt_process,)

Text Date Format:

ref: https://docs.python.org/2/library/datetime.html

Directive Meaning Example Notes
%a Weekday as locale’s abbreviated name.
Sun, Mon, ..., Sat (en_US);
So, Mo, ..., Sa (de_DE)
(1)
%A Weekday as locale’s full name.
Sunday, Monday, ..., Saturday (en_US);
Sonntag, Montag, ..., Samstag (de_DE)
(1)
%w Weekday as a decimal number, where 0 is Sunday and 6 is Saturday. 0, 1, ..., 6  
%d Day of the month as a zero-padded decimal number. 01, 02, ..., 31  
%b Month as locale’s abbreviated name.
Jan, Feb, ..., Dec (en_US);
Jan, Feb, ..., Dez (de_DE)
(1)
%B Month as locale’s full name.
January, February, ..., December (en_US);
Januar, Februar, ..., Dezember (de_DE)
(1)
%m Month as a zero-padded decimal number. 01, 02, ..., 12  
%y Year without century as a zero-padded decimal number. 00, 01, ..., 99  
%Y Year with century as a decimal number. 1970, 1988, 2001, 2013  
%H Hour (24-hour clock) as a zero-padded decimal number. 00, 01, ..., 23  
%I Hour (12-hour clock) as a zero-padded decimal number. 01, 02, ..., 12  
%p Locale’s equivalent of either AM or PM.
AM, PM (en_US);
am, pm (de_DE)
(1), (2)
%M Minute as a zero-padded decimal number. 00, 01, ..., 59  
%S Second as a zero-padded decimal number. 00, 01, ..., 59 (3)
%f Microsecond as a decimal number, zero-padded on the left. 000000, 000001, ..., 999999 (4)
%z UTC offset in the form +HHMM or -HHMM (empty string if the object is naive). (empty), +0000, -0400, +1030 (5)
%Z Time zone name (empty string if the object is naive). (empty), UTC, EST, CST  
%j Day of the year as a zero-padded decimal number. 001, 002, ..., 366  
%U Week number of the year (Sunday as the first day of the week) as a zero padded decimal number. All days in a new year preceding the first Sunday are considered to be in week 0. 00, 01, ..., 53 (6)
%W Week number of the year (Monday as the first day of the week) as a decimal number. All days in a new year preceding the first Monday are considered to be in week 0. 00, 01, ..., 53 (6)
%c Locale’s appropriate date and time representation.
Tue Aug 16 21:30:00 1988 (en_US);
Di 16 Aug 21:30:00 1988 (de_DE)
(1)
%x Locale’s appropriate date representation.
08/16/88 (None);
08/16/1988 (en_US);
16.08.1988 (de_DE)
(1)
%X Locale’s appropriate time representation.
21:30:00 (en_US);
21:30:00 (de_DE)
(1)
%% A literal '%' character. %  

Feature Extraction

Text


In [6]:
# Alert templates for the text feature-extraction cell.
# NOTE: these rebind the global names success_alert / error_alert that earlier
# cells also define; callbacks resolving the names at click time use whichever
# definition ran last.
success_alert = """
<div class="alert alert-success" role="alert">Text features extraction was successful.</div>
"""

error_alert = """
<div class="alert alert-danger" role="alert">Error in features extraction. %s.</div>
"""

def get_map_dict(column):
    """Map each distinct value of a ``csv_data`` column to its position in ``unique()`` order.

    Returns a dict ``{value: index}`` with indices starting at 0.
    """
    distinct_values = csv_data[column].unique()
    return dict((value, position) for position, value in enumerate(distinct_values))


def process_text_column(column):
    """Apply the text-processing option chosen in one row of the text-features form.

    ``column`` is a row container: child 0 holds the column name, child 1 the
    chosen process and child 2 the text of a mapping dict.

    * "Map" adds ``<name>_mapped`` by mapping the column through that dict.
    * "Binary Vectorize" adds one 0/1 column per distinct value, named by
      ``DictVectorizer`` with the ``"_is_"`` separator.
    * Any other choice leaves ``csv_data`` untouched.

    Raises
    ------
    Exception
        Whatever ``eval`` raises when "Map" is selected and the dict text is
        not a valid Python expression.
    """
    column_name = column.children[0].value
    text_process = column.children[1].value

    if text_process == "Map":
        # The dict text is evaluated only here, so a blank or malformed dict
        # field cannot break the other options.
        # SECURITY(review): eval() executes arbitrary Python typed into the text
        # box. Acceptable only while the notebook user is the sole author of
        # that text; consider ast.literal_eval if that ever changes.
        map_dict = eval(column.children[2].value)
        csv_data[column_name + "_mapped"] = csv_data[column_name].map(map_dict)
    elif text_process == "Binary Vectorize":
        records = [{column_name: item} for item in csv_data[column_name]]
        vec = DictVectorizer(separator="_is_")
        indicator_matrix = vec.fit_transform(records).toarray()
        # One new csv_data column per feature the vectorizer produced.
        for index, feature_name in enumerate(vec.get_feature_names()):
            csv_data[feature_name] = indicator_matrix[:, index]


def process_text_columns(widget):
    """Click handler: run ``process_text_column`` on every row container, then show the result alert.

    Rows are the children of ``main_container`` that are ``ContainerWidget``
    instances. Any exception is caught and reported through ``wgt_alert``.
    """
    try:
        rows = [child for child in main_container.children
                if isinstance(child, widgets.ContainerWidget)]
        for row in rows:
            process_text_column(row)
        wgt_alert.value = success_alert
    except Exception as ex:
        wgt_alert.value = error_alert % ex
    # The alert is revealed on both the success and the failure path.
    wgt_alert.visible = True

    
# Build the text-features form: one row per object-dtype column of csv_data.
# NOTE: main_container, wgt_alert and wgt_process rebind names that the
# type-conversion cell also assigns.
main_container = widgets.ContainerWidget()
display(main_container)

wgt_alert = widgets.HTMLWidget(visible=False)
wgt_process = widgets.ButtonWidget(description="Process Columns")
wgt_process.on_click(process_text_columns)

main_container.children = (wgt_alert,)

# Not read by any code visible in this notebook excerpt.
columns = []

for column in csv_data.columns:
    # Only columns whose dtype is "object" get a row.
    if str(pd.Series(csv_data[column].values).dtype) == "object":
        temp_container = widgets.ContainerWidget()

        main_container.children += (temp_container,)

        # Swap the 'vbox' class for 'hbox' so the row's children sit side by side.
        temp_container.remove_class('vbox')
        temp_container.add_class('hbox')
        temp_container.add_class('start')

        # Row layout: [0] column name, [1] process choice, [2] mapping dict text.
        w1 = widgets.TextWidget(value=column, disabled=True)
        w2 = widgets.DropdownWidget(values=["Map", "Binary Vectorize", "Don't Process"])
        w3 = widgets.TextWidget(description="Dict: {'m': 0, 'f': 1}")

        w1.set_css("width","200px")

        # Pre-fill the dict box with a value -> index mapping of the column.
        w3.value = str(get_map_dict(column))

        children = [w1, w2, w3]

        temp_container.children = children;

# The action button goes last so it is laid out below all the rows.
main_container.children += (wgt_process,)

Data Preview


In [8]:
def print_preview():
    """Print the first and last five rows of ``csv_data``, separated by three dotted lines."""
    first_rows = csv_data.head(5)
    last_rows = csv_data.tail(5)
    separator = ".\n" * 3
    print("Data Sample:")
    print(first_rows)
    print(separator)
    print(last_rows)

print_preview()


Data Sample:
                                               Video                  Upload Time  Video length (minutes)  Views  \
0  1. Notebooks and Cells - IPython Notebook Tuto...       July 26, 2014 02:47 AM                    7.55    327   
1        4. NumPy Basics - IPython Notebook Tutorial     August 07, 2014 06:04 AM                   30.72    249   
2     Load Balancing - IPython Parallel Computing #1  September 10, 2014 01:58 AM                    8.57    244   
3     6. IPython Widgets - IPython Notebook Tutorial     August 10, 2014 10:59 AM                   10.07    234   
4    2. Markdown & LaTeX - IPython Notebook Tutorial       July 31, 2014 02:39 AM                   10.77    230   

   Estimated minutes watched  Average view duration (minutes)  Average percentage viewed  Subscriber views  \
0                        858                             2.62                      34.76                16   
1                       1829                             7.34                      23.91                26   
2                        712                             2.92                      34.06                 6   
3                        823                             3.52                      34.93                15   
4                        821                             3.57                      33.15                12   

   Subscriber minutes watched  Likes  Dislikes  Shares  Comments  Favorites  Subscribers    Upload Time_date  \
0                          63      3         1       2         2          0            8 2014-07-26 02:47:00   
1                         159      4         0       2         3          0            4 2014-08-07 06:04:00   
2                          10      6         0       1         3          1            4 2014-09-10 01:58:00   
3                          53      6         0       1         8          0            1 2014-08-10 10:59:00   
4                          66      3         1       3         2          1            3 2014-07-31 02:39:00   

   Video_mapped  Upload Time_mapped  Video_is_1. Notebooks and Cells - IPython Notebook Tutorial  \
0             0                   0                                                  1             
1             1                   1                                                  0             
2             2                   2                                                  0             
3             3                   3                                                  0             
4             4                   4                                                  0             

   Video_is_2. Markdown & LaTeX - IPython Notebook Tutorial  Video_is_3. Basic Python - IPython Notebook Tutorial  \
0                                                  0                                                         0      
1                                                  0                                                         0      
2                                                  0                                                         0      
3                                                  0                                                         0      
4                                                  1                                                         0      

   Video_is_4. NumPy Basics - IPython Notebook Tutorial  \
0                                                  0      
1                                                  1      
2                                                  0      
3                                                  0      
4                                                  0      

   Video_is_5. Plotting Charts with Matplotlib - IPython Notebook Tutorial  \
0                                                  0                         
1                                                  0                         
2                                                  0                         
3                                                  0                         
4                                                  0                         

   Video_is_6. IPython Widgets - IPython Notebook Tutorial  Video_is_7. Pandas - IPython Notebook Tutorial  \
0                                                  0                                                     0   
1                                                  0                                                     0   
2                                                  0                                                     0   
3                                                  1                                                     0   
4                                                  0                                                     0   

   Video_is_8. SymPy - IPython Notebook Tutorial  Video_is_Can computers DISCRIMINATE  against race and gender?  \
0                                              0                                                  0               
1                                              0                                                  0               
2                                              0                                                  0               
3                                              0                                                  0               
4                                              0                                                  0               

   Video_is_Container Widgets - IPython Widgets #1  Video_is_Folder Management - IPython Notebook Tips  \
0                                                0                                                  0    
1                                                0                                                  0    
2                                                0                                                  0    
3                                                0                                                  0    
4                                                0                                                  0    

   Video_is_Handling Events - IPython Widgets #2  Video_is_How does SVM work?  \
0                                              0                            0   
1                                              0                            0   
2                                              0                            0   
3                                              0                            0   
4                                              0                            0   

   Video_is_How does kNN (k-Nearest Neighbors) work?  \
0                                                  0   
1                                                  0   
2                                                  0   
3                                                  0   
4                                                  0   

   Video_is_Indian Elections 2014 - IPython Notebook Tutorial (Exercise)  \
0                                                  0                       
1                                                  0                       
2                                                  0                       
3                                                  0                       
4                                                  0                       

   Video_is_Load Balancing - IPython Parallel Computing #1  \
0                                                  0         
1                                                  0         
2                                                  1         
3                                                  0         
4                                                  0         

   Video_is_Machine Learning 1 - Setup Development Environment  Video_is_Machine Learning 2 - Introduction to ML  \
0                                                  0                                                           0   
1                                                  0                                                           0   
2                                                  0                                                           0   
3                                                  0                                                           0   
4                                                  0                                                           0   

   Video_is_Machine Learning 3 - Clustering  Video_is_Machine Learning 3 - Regression  \
0                                         0                                         0   
1                                         0                                         0   
2                                         0                                         0   
3                                         0                                         0   
4                                         0                                         0   

   Video_is_Styling Widgets - IPython Widgets #3  Video_is_Web DICOM Viewer  \
0                                              0                          0   
1                                              0                          0   
2                                              0                          0   
3                                              0                          0   
4                                              0                          0   

   Video_is_Widgets Alignment - IPython Widgets #4  
0                                                0  
1                                                0  
2                                                0  
3                                                0  
4                                                0  
.
.
.

                                                Video                  Upload Time  Video length (minutes)  Views  \
18               Handling Events - IPython Widgets #2  September 11, 2014 09:54 PM                    5.63     31   
19                                   Web DICOM Viewer        May 30, 2014 11:56 PM                    1.72     28   
20  Indian Elections 2014 - IPython Notebook Tutor...  September 06, 2014 03:08 PM                   30.58     23   
21               Styling Widgets - IPython Widgets #3  September 13, 2014 04:26 AM                    8.25     23   
22             Widgets Alignment - IPython Widgets #4  September 27, 2014 10:39 PM                    3.18      4   

    Estimated minutes watched  Average view duration (minutes)  Average percentage viewed  Subscriber views  \
18                         89                             2.87                      50.88                 4   
19                         32                             1.13                      71.10                10   
20                         90                             3.93                      12.86                 6   
21                         87                             3.79                      45.99                 9   
22                          5                             1.14                      35.68                 2   

    Subscriber minutes watched  Likes  Dislikes  Shares  Comments  Favorites  Subscribers    Upload Time_date  \
18                          12      2         0       2         2          1            1 2014-09-11 21:54:00   
19                          13      3         0       2         2          0            0 2014-05-30 23:56:00   
20                          21      3         0       1         1          0            2 2014-09-06 15:08:00   
21                          35      2         0       0         1          0            0 2014-09-13 04:26:00   
22                           1      2         0       0         1          0            0 2014-09-27 22:39:00   

    Video_mapped  Upload Time_mapped  Video_is_1. Notebooks and Cells - IPython Notebook Tutorial  \
18            18                  18                                                  0             
19            19                  19                                                  0             
20            20                  20                                                  0             
21            21                  21                                                  0             
22            22                  22                                                  0             

    Video_is_2. Markdown & LaTeX - IPython Notebook Tutorial  Video_is_3. Basic Python - IPython Notebook Tutorial  \
18                                                  0                                                         0      
19                                                  0                                                         0      
20                                                  0                                                         0      
21                                                  0                                                         0      
22                                                  0                                                         0      

    Video_is_4. NumPy Basics - IPython Notebook Tutorial  \
18                                                  0      
19                                                  0      
20                                                  0      
21                                                  0      
22                                                  0      

    Video_is_5. Plotting Charts with Matplotlib - IPython Notebook Tutorial  \
18                                                  0                         
19                                                  0                         
20                                                  0                         
21                                                  0                         
22                                                  0                         

    Video_is_6. IPython Widgets - IPython Notebook Tutorial  Video_is_7. Pandas - IPython Notebook Tutorial  \
18                                                  0                                                     0   
19                                                  0                                                     0   
20                                                  0                                                     0   
21                                                  0                                                     0   
22                                                  0                                                     0   

    Video_is_8. SymPy - IPython Notebook Tutorial  Video_is_Can computers DISCRIMINATE  against race and gender?  \
18                                              0                                                  0               
19                                              0                                                  0               
20                                              0                                                  0               
21                                              0                                                  0               
22                                              0                                                  0               

    Video_is_Container Widgets - IPython Widgets #1  Video_is_Folder Management - IPython Notebook Tips  \
18                                                0                                                  0    
19                                                0                                                  0    
20                                                0                                                  0    
21                                                0                                                  0    
22                                                0                                                  0    

    Video_is_Handling Events - IPython Widgets #2  Video_is_How does SVM work?  \
18                                              1                            0   
19                                              0                            0   
20                                              0                            0   
21                                              0                            0   
22                                              0                            0   

    Video_is_How does kNN (k-Nearest Neighbors) work?  \
18                                                  0   
19                                                  0   
20                                                  0   
21                                                  0   
22                                                  0   

    Video_is_Indian Elections 2014 - IPython Notebook Tutorial (Exercise)  \
18                                                  0                       
19                                                  0                       
20                                                  1                       
21                                                  0                       
22                                                  0                       

    Video_is_Load Balancing - IPython Parallel Computing #1  \
18                                                  0         
19                                                  0         
20                                                  0         
21                                                  0         
22                                                  0         

    Video_is_Machine Learning 1 - Setup Development Environment  Video_is_Machine Learning 2 - Introduction to ML  \
18                                                  0                                                           0   
19                                                  0                                                           0   
20                                                  0                                                           0   
21                                                  0                                                           0   
22                                                  0                                                           0   

    Video_is_Machine Learning 3 - Clustering  Video_is_Machine Learning 3 - Regression  \
18                                         0                                         0   
19                                         0                                         0   
20                                         0                                         0   
21                                         0                                         0   
22                                         0                                         0   

    Video_is_Styling Widgets - IPython Widgets #3  Video_is_Web DICOM Viewer  \
18                                              0                          0   
19                                              0                          1   
20                                              0                          0   
21                                              1                          0   
22                                              0                          0   

    Video_is_Widgets Alignment - IPython Widgets #4  
18                                                0  
19                                                0  
20                                                0  
21                                                0  
22                                                1  

Date


In [9]:
# Alert templates for the date feature-extraction cell.
# NOTE: these rebind the global names success_alert / error_alert that earlier
# cells also define; callbacks resolving the names at click time use whichever
# definition ran last.
success_alert = """
<div class="alert alert-success" role="alert">Date features extraction was successful.</div>
"""

error_alert = """
<div class="alert alert-danger" role="alert">Error in date extraction. %s.</div>
"""

def process_date_column(column):
    """Add the checked date-part columns for one datetime column of csv_data.

    `column` is a row container: children[0] holds the source column name and
    children[1]..children[7] are the Year, Month, Day, Day-of-week, Hour,
    Minute and Second checkboxes, in that order.  Every checked part is
    written to csv_data as "<source column>_<part>".
    """
    source_name = column.children[0].value

    # (checkbox position in the row, timestamp attribute == new column suffix)
    date_parts = (
        (1, "year"),
        (2, "month"),
        (3, "day"),
        (4, "dayofweek"),
        (5, "hour"),
        (6, "minute"),
        (7, "second"),
    )
    # Microsecond extraction stays disabled: the row has no checkbox for it.

    for position, part in date_parts:
        if column.children[position].value:
            # `part=part` pins the current attribute name inside the lambda.
            csv_data[source_name + "_" + part] = csv_data[source_name].apply(
                lambda stamp, part=part: getattr(stamp, part)
            )

def process_date_columns(widget):
    """Button callback: extract date parts for every row container, then report.

    Each ContainerWidget child of main_container is handed to
    process_date_column.  wgt_alert is set to the success markup, or to the
    error markup with the caught exception, and is then made visible.
    `widget` (the clicked button) is not used.
    """
    try:
        row_containers = (
            child for child in main_container.children
            if isinstance(child, widgets.ContainerWidget)
        )
        for row in row_containers:
            process_date_column(row)
        wgt_alert.value = success_alert
    except Exception as failure:
        wgt_alert.value = error_alert % failure
    wgt_alert.visible = True


# Build the date-extraction form: an alert area, one row of checkboxes per
# datetime column of csv_data, and a "Process Columns" button at the bottom.
# NOTE(review): main_container and wgt_alert are module-level names that the
# Normalization cell assigns again; process_date_columns reads them when the
# button is clicked, so the order in which the cells were run matters.
main_container = widgets.ContainerWidget()
display(main_container)

# Hidden until process_date_columns fills in a success or error message.
wgt_alert = widgets.HTMLWidget(visible=False)
wgt_process = widgets.ButtonWidget(description="Process Columns")
wgt_process.on_click(process_date_columns)

main_container.children = (wgt_alert,)

# NOTE(review): `columns` is not read anywhere in this cell.
columns = []

for column in csv_data.columns:
    # Only columns whose values have dtype datetime64[ns] get a row.
    if str(pd.Series(csv_data[column].values).dtype) == "datetime64[ns]":
        temp_container = widgets.ContainerWidget()

        # The row is attached to the already displayed container before its
        # CSS classes are changed.
        main_container.children += (temp_container,)

        # Swap the 'vbox' class for 'hbox' and 'start' -- presumably to lay the
        # row's controls out horizontally; confirm against the widget CSS.
        temp_container.remove_class('vbox')
        temp_container.add_class('hbox')
        temp_container.add_class('start')

        # Child order matters: process_date_column reads the column name from
        # children[0] and the Year..Second checkboxes from children[1]..[7].
        w1 = widgets.TextWidget(value=column, disabled=True)
        w2 = widgets.CheckboxWidget(description="Year", value=True)
        w3 = widgets.CheckboxWidget(description="Month", value=True)
        w4 = widgets.CheckboxWidget(description="Day", value=True)
        w5 = widgets.CheckboxWidget(description="DayOfWeek", value=True)
        w6 = widgets.CheckboxWidget(description="Hour", value=True)
        w7 = widgets.CheckboxWidget(description="Minute", value=True)
        w8 = widgets.CheckboxWidget(description="Second", value=True)
        #w9 = widgets.CheckboxWidget(description="MS", value=True)

        w1.set_css("width","200px")

        children = [w1, w2, w3, w4, w5, w6, w7, w8]

        temp_container.children = children;

# The button goes last so it appears after all column rows.
main_container.children += (wgt_process,)

In [10]:
# print_preview is defined outside this section; the recorded output shows the
# first and last rows of csv_data, now including the "Upload Time_date_*" columns.
print_preview()


Data Sample:
                                               Video                  Upload Time  Video length (minutes)  Views  \
0  1. Notebooks and Cells - IPython Notebook Tuto...       July 26, 2014 02:47 AM                    7.55    327   
1        4. NumPy Basics - IPython Notebook Tutorial     August 07, 2014 06:04 AM                   30.72    249   
2     Load Balancing - IPython Parallel Computing #1  September 10, 2014 01:58 AM                    8.57    244   
3     6. IPython Widgets - IPython Notebook Tutorial     August 10, 2014 10:59 AM                   10.07    234   
4    2. Markdown & LaTeX - IPython Notebook Tutorial       July 31, 2014 02:39 AM                   10.77    230   

   Estimated minutes watched  Average view duration (minutes)  Average percentage viewed  Subscriber views  \
0                        858                             2.62                      34.76                16   
1                       1829                             7.34                      23.91                26   
2                        712                             2.92                      34.06                 6   
3                        823                             3.52                      34.93                15   
4                        821                             3.57                      33.15                12   

   Subscriber minutes watched  Likes  Dislikes  Shares  Comments  Favorites  Subscribers    Upload Time_date  \
0                          63      3         1       2         2          0            8 2014-07-26 02:47:00   
1                         159      4         0       2         3          0            4 2014-08-07 06:04:00   
2                          10      6         0       1         3          1            4 2014-09-10 01:58:00   
3                          53      6         0       1         8          0            1 2014-08-10 10:59:00   
4                          66      3         1       3         2          1            3 2014-07-31 02:39:00   

   Video_mapped  Upload Time_mapped  Video_is_1. Notebooks and Cells - IPython Notebook Tutorial  \
0             0                   0                                                  1             
1             1                   1                                                  0             
2             2                   2                                                  0             
3             3                   3                                                  0             
4             4                   4                                                  0             

   Video_is_2. Markdown & LaTeX - IPython Notebook Tutorial  Video_is_3. Basic Python - IPython Notebook Tutorial  \
0                                                  0                                                         0      
1                                                  0                                                         0      
2                                                  0                                                         0      
3                                                  0                                                         0      
4                                                  1                                                         0      

   Video_is_4. NumPy Basics - IPython Notebook Tutorial  \
0                                                  0      
1                                                  1      
2                                                  0      
3                                                  0      
4                                                  0      

   Video_is_5. Plotting Charts with Matplotlib - IPython Notebook Tutorial  \
0                                                  0                         
1                                                  0                         
2                                                  0                         
3                                                  0                         
4                                                  0                         

   Video_is_6. IPython Widgets - IPython Notebook Tutorial  Video_is_7. Pandas - IPython Notebook Tutorial  \
0                                                  0                                                     0   
1                                                  0                                                     0   
2                                                  0                                                     0   
3                                                  1                                                     0   
4                                                  0                                                     0   

   Video_is_8. SymPy - IPython Notebook Tutorial  Video_is_Can computers DISCRIMINATE  against race and gender?  \
0                                              0                                                  0               
1                                              0                                                  0               
2                                              0                                                  0               
3                                              0                                                  0               
4                                              0                                                  0               

   Video_is_Container Widgets - IPython Widgets #1  Video_is_Folder Management - IPython Notebook Tips  \
0                                                0                                                  0    
1                                                0                                                  0    
2                                                0                                                  0    
3                                                0                                                  0    
4                                                0                                                  0    

   Video_is_Handling Events - IPython Widgets #2  Video_is_How does SVM work?  \
0                                              0                            0   
1                                              0                            0   
2                                              0                            0   
3                                              0                            0   
4                                              0                            0   

   Video_is_How does kNN (k-Nearest Neighbors) work?  \
0                                                  0   
1                                                  0   
2                                                  0   
3                                                  0   
4                                                  0   

   Video_is_Indian Elections 2014 - IPython Notebook Tutorial (Exercise)  \
0                                                  0                       
1                                                  0                       
2                                                  0                       
3                                                  0                       
4                                                  0                       

   Video_is_Load Balancing - IPython Parallel Computing #1  \
0                                                  0         
1                                                  0         
2                                                  1         
3                                                  0         
4                                                  0         

   Video_is_Machine Learning 1 - Setup Development Environment  Video_is_Machine Learning 2 - Introduction to ML  \
0                                                  0                                                           0   
1                                                  0                                                           0   
2                                                  0                                                           0   
3                                                  0                                                           0   
4                                                  0                                                           0   

   Video_is_Machine Learning 3 - Clustering  Video_is_Machine Learning 3 - Regression  \
0                                         0                                         0   
1                                         0                                         0   
2                                         0                                         0   
3                                         0                                         0   
4                                         0                                         0   

   Video_is_Styling Widgets - IPython Widgets #3  Video_is_Web DICOM Viewer  \
0                                              0                          0   
1                                              0                          0   
2                                              0                          0   
3                                              0                          0   
4                                              0                          0   

   Video_is_Widgets Alignment - IPython Widgets #4  Upload Time_date_month  Upload Time_date_day  \
0                                                0                       7                    26   
1                                                0                       8                     7   
2                                                0                       9                    10   
3                                                0                       8                    10   
4                                                0                       7                    31   

   Upload Time_date_dayofweek  Upload Time_date_hour  Upload Time_date_minute  
0                           5                      2                       47  
1                           3                      6                        4  
2                           2                      1                       58  
3                           6                     10                       59  
4                           3                      2                       39  
.
.
.

                                                Video                  Upload Time  Video length (minutes)  Views  \
18               Handling Events - IPython Widgets #2  September 11, 2014 09:54 PM                    5.63     31   
19                                   Web DICOM Viewer        May 30, 2014 11:56 PM                    1.72     28   
20  Indian Elections 2014 - IPython Notebook Tutor...  September 06, 2014 03:08 PM                   30.58     23   
21               Styling Widgets - IPython Widgets #3  September 13, 2014 04:26 AM                    8.25     23   
22             Widgets Alignment - IPython Widgets #4  September 27, 2014 10:39 PM                    3.18      4   

    Estimated minutes watched  Average view duration (minutes)  Average percentage viewed  Subscriber views  \
18                         89                             2.87                      50.88                 4   
19                         32                             1.13                      71.10                10   
20                         90                             3.93                      12.86                 6   
21                         87                             3.79                      45.99                 9   
22                          5                             1.14                      35.68                 2   

    Subscriber minutes watched  Likes  Dislikes  Shares  Comments  Favorites  Subscribers    Upload Time_date  \
18                          12      2         0       2         2          1            1 2014-09-11 21:54:00   
19                          13      3         0       2         2          0            0 2014-05-30 23:56:00   
20                          21      3         0       1         1          0            2 2014-09-06 15:08:00   
21                          35      2         0       0         1          0            0 2014-09-13 04:26:00   
22                           1      2         0       0         1          0            0 2014-09-27 22:39:00   

    Video_mapped  Upload Time_mapped  Video_is_1. Notebooks and Cells - IPython Notebook Tutorial  \
18            18                  18                                                  0             
19            19                  19                                                  0             
20            20                  20                                                  0             
21            21                  21                                                  0             
22            22                  22                                                  0             

    Video_is_2. Markdown & LaTeX - IPython Notebook Tutorial  Video_is_3. Basic Python - IPython Notebook Tutorial  \
18                                                  0                                                         0      
19                                                  0                                                         0      
20                                                  0                                                         0      
21                                                  0                                                         0      
22                                                  0                                                         0      

    Video_is_4. NumPy Basics - IPython Notebook Tutorial  \
18                                                  0      
19                                                  0      
20                                                  0      
21                                                  0      
22                                                  0      

    Video_is_5. Plotting Charts with Matplotlib - IPython Notebook Tutorial  \
18                                                  0                         
19                                                  0                         
20                                                  0                         
21                                                  0                         
22                                                  0                         

    Video_is_6. IPython Widgets - IPython Notebook Tutorial  Video_is_7. Pandas - IPython Notebook Tutorial  \
18                                                  0                                                     0   
19                                                  0                                                     0   
20                                                  0                                                     0   
21                                                  0                                                     0   
22                                                  0                                                     0   

    Video_is_8. SymPy - IPython Notebook Tutorial  Video_is_Can computers DISCRIMINATE  against race and gender?  \
18                                              0                                                  0               
19                                              0                                                  0               
20                                              0                                                  0               
21                                              0                                                  0               
22                                              0                                                  0               

    Video_is_Container Widgets - IPython Widgets #1  Video_is_Folder Management - IPython Notebook Tips  \
18                                                0                                                  0    
19                                                0                                                  0    
20                                                0                                                  0    
21                                                0                                                  0    
22                                                0                                                  0    

    Video_is_Handling Events - IPython Widgets #2  Video_is_How does SVM work?  \
18                                              1                            0   
19                                              0                            0   
20                                              0                            0   
21                                              0                            0   
22                                              0                            0   

    Video_is_How does kNN (k-Nearest Neighbors) work?  \
18                                                  0   
19                                                  0   
20                                                  0   
21                                                  0   
22                                                  0   

    Video_is_Indian Elections 2014 - IPython Notebook Tutorial (Exercise)  \
18                                                  0                       
19                                                  0                       
20                                                  1                       
21                                                  0                       
22                                                  0                       

    Video_is_Load Balancing - IPython Parallel Computing #1  \
18                                                  0         
19                                                  0         
20                                                  0         
21                                                  0         
22                                                  0         

    Video_is_Machine Learning 1 - Setup Development Environment  Video_is_Machine Learning 2 - Introduction to ML  \
18                                                  0                                                           0   
19                                                  0                                                           0   
20                                                  0                                                           0   
21                                                  0                                                           0   
22                                                  0                                                           0   

    Video_is_Machine Learning 3 - Clustering  Video_is_Machine Learning 3 - Regression  \
18                                         0                                         0   
19                                         0                                         0   
20                                         0                                         0   
21                                         0                                         0   
22                                         0                                         0   

    Video_is_Styling Widgets - IPython Widgets #3  Video_is_Web DICOM Viewer  \
18                                              0                          0   
19                                              0                          1   
20                                              0                          0   
21                                              1                          0   
22                                              0                          0   

    Video_is_Widgets Alignment - IPython Widgets #4  Upload Time_date_month  Upload Time_date_day  \
18                                                0                       9                    11   
19                                                0                       5                    30   
20                                                0                       9                     6   
21                                                0                       9                    13   
22                                                1                       9                    27   

    Upload Time_date_dayofweek  Upload Time_date_hour  Upload Time_date_minute  
18                           3                     21                       54  
19                           4                     23                       56  
20                           5                     15                        8  
21                           5                      4                       26  
22                           5                     22                       39  

Normalization


In [11]:
# HTML alert markup shown in wgt_alert after the numeric "Process Columns" button is clicked.
success_alert = (
    "\n"
    '<div class="alert alert-success" role="alert">'
    "Numbers processing was successful."
    "</div>\n"
)

# The %s placeholder is meant to receive the error raised while processing a column.
error_alert = (
    "\n"
    '<div class="alert alert-danger" role="alert">'
    "Error in numbers processing. %s."
    "</div>\n"
)

def process_number_column(column):
    """Scale or standardize one numeric column of csv_data, as chosen in its row.

    `column` is a row container whose children are, in order: the column-name
    text box, the processing dropdown ("Scale", "Standard Scaler" or
    "Don't Process"), and the "Scale Min" / "Scale Max" inputs.

    "Scale" writes "<name>_scaled" using MinMaxScaler with the requested
    feature range; "Standard Scaler" writes "<name>_standardscaler" using
    StandardScaler.  Any other dropdown value leaves csv_data untouched.
    """
    column_name = column.children[0].value
    number_process = column.children[1].value

    if number_process == "Scale":
        scale_min = column.children[2].value
        scale_max = column.children[3].value
        pre_process = preprocessing.MinMaxScaler(feature_range=(scale_min, scale_max))
        suffix = "_scaled"
    elif number_process == "Standard Scaler":
        pre_process = preprocessing.StandardScaler()
        suffix = "_standardscaler"
    else:
        return

    # scikit-learn transformers work on 2-D (n_samples, n_features) input.  The
    # standard-scaler branch used to pass a 1-D Series, unlike the "Scale"
    # branch; to our knowledge current scikit-learn releases reject 1-D input.
    # Both branches now select the column as a one-column frame.
    features = csv_data[[column_name]].astype(np.float64).values
    transformed = pre_process.fit_transform(features)
    # Flatten the (n, 1) result so it is stored as an ordinary 1-D column.
    csv_data[column_name + suffix] = np.asarray(transformed).ravel()

def process_number_columns(widget):
    """Button callback: process every numeric row container, then report.

    Each ContainerWidget child of main_container is handed to
    process_number_column.  wgt_alert is set to the success markup, or to the
    error markup with the caught exception, and is then made visible.
    `widget` (the clicked button) is not used.

    The previous `if True: ... else: raise ex` form was a debugging leftover:
    its else branch could never run (and referenced an undefined `ex`), so
    failures never reached the alert.  The try/except is restored to match
    process_date_columns.
    """
    try:
        for column in main_container.children:
            if isinstance(column, widgets.ContainerWidget):
                process_number_column(column)
        wgt_alert.value = success_alert
    except Exception as ex:
        wgt_alert.value = error_alert % ex
    wgt_alert.visible = True

    
# Build the normalization form: an alert area, one row per numeric column of
# csv_data (name, processing choice, scale range), and a "Process Columns" button.
# NOTE(review): this rebinds main_container, wgt_alert and wgt_process, which
# the Date cell also assigns; the button callbacks read these names at click time.
main_container = widgets.ContainerWidget()
display(main_container)

# Hidden until process_number_columns sets a message.
wgt_alert = widgets.HTMLWidget(visible=False)
wgt_process = widgets.ButtonWidget(description="Process Columns")
wgt_process.on_click(process_number_columns)

main_container.children = (wgt_alert,)

# NOTE(review): `columns` is not read anywhere in this cell.
columns = []

for column in csv_data.columns:
    data_type = str(pd.Series(csv_data[column].values).dtype)
    # Only these four numeric dtypes get a row; other dtypes are skipped.
    if data_type in ["float32", "float64", "int32", "int64"]:
        temp_container = widgets.ContainerWidget()

        # The row is attached to the already displayed container before its
        # CSS classes are changed.
        main_container.children += (temp_container,)

        # Swap the 'vbox' class for 'hbox' and 'start' -- presumably to lay the
        # row's controls out horizontally; confirm against the widget CSS.
        temp_container.remove_class('vbox')
        temp_container.add_class('hbox')
        temp_container.add_class('start')

        # Child order matters: process_number_column reads the name, the
        # dropdown choice, and the min/max from children[0]..children[3].
        w1 = widgets.TextWidget(value=column, disabled=True)
        w2 = widgets.DropdownWidget(values=["Scale", "Standard Scaler", "Don't Process"], value="Don't Process")
        w3 = widgets.FloatTextWidget(description="Scale Min:", value=0)
        w4 = widgets.FloatTextWidget(description="Scale Max:", value=1)

        w1.set_css("width","200px")
        w3.set_css("width","50px")
        w4.set_css("width","50px")

        children = [w1, w2, w3, w4]

        temp_container.children = children;

# The button goes last so it appears after all column rows.
main_container.children += (wgt_process,)

Visualize Preprocessing


In [13]:
def display_feature(feature_name):
    """Scatter-plot one csv_data column against the row index, then print its mean and std."""
    plt.figure()
    feature_values = csv_data[feature_name]
    plt.scatter(list(csv_data.index), feature_values)
    plt.grid()
    plt.ylabel(feature_name)
    plt.show()

    # Single parenthesised argument: prints the same text under the Python 2
    # print statement that the rest of this notebook uses.
    print("Mean: %4f" % np.mean(feature_values))
    print("Std : %4f" % np.std(feature_values))

# Offer every numeric column of csv_data in a selection list and wire the
# selection to display_feature through interact.
numeric_column_names = list(csv_data._get_numeric_data().columns)
wgt_column = widgets.SelectWidget(values=numeric_column_names)

interact(display_feature, feature_name=wgt_column);


Mean: 0.000000
Std : 1.000000

Further Preprocessing:

  • Missing Values:
    • Drop Columns (Features/Attributes)
    • Drop Rows (Samples/Data Points)
    • Impute Values (Mean/Median/Mode)
  • Denoising Time Series
  • Count Vectorizer (bag-of-words) for Long Text

Save Dataset


In [14]:
# DataFrame.save() pickles the frame (pandas deprecated it in favour of
# to_pickle), so the "processed_data.csv" it wrote was not readable as CSV.
# Write real CSV text instead.  index=False leaves out the row index so that
# pd.read_csv does not get an extra unnamed column.
csv_data.to_csv("processed_data.csv", index=False)

In [15]:
# Persist the DataFrame object with joblib; the list of files it wrote is the
# cell's output.
joblib.dump(value=csv_data, filename="processed_data.pkl")


Out[15]:
['processed_data.pkl',
 'processed_data.pkl_01.npy',
 'processed_data.pkl_02.npy',
 'processed_data.pkl_03.npy',
 'processed_data.pkl_04.npy',
 'processed_data.pkl_05.npy']